Shortcuts

Alt + - adds the assignment operator <-
Ctrl + Shift + c adds # in front of a line (comments it out)
---- or four dashes for a header, so it is easy to navigate through the script
command/Ctrl + Shift + m for pipe %>%
Ctrl+ Alt + i for new code chunk

Rmd syntax

Plain text
End a line with two spaces to start a new paragraph.
For italics *text* or _text_ (without gap *text*)
For bold **text**(without gap **text**)
superscript2 superscript^2^
Strikethrough ~~Strikethrough~~ (a single pair, ~text~, gives subscript)

Adding a web link to text: write [text](http://xyz.com), e.g. [link to rstudio](http://xyz.com), with no gap between the ] and the ( that holds the pasted link

Logical operations

# Each comparison below returns a single logical value (TRUE or FALSE)
1==1 # equal
1!=3 # unequal
13<14 # 13 smaller than 14
14>13 # 14 bigger than 13
12>=0 # 12 greater or equal to zero
12<=3 # 12 smaller or equal to 3 (this one is FALSE)

Creating data.frame

i.e. family

name <- c("saneesh", "sanusha", "appu", "kishan")
weight <- c(63, 48, 20, NA)
height <- c(164, 150, NA, 75)
family <- data.frame(name, weight, height)
family %>%
    as_tibble()
same.family <- data.frame(name = c("saneesh", "sanusha", "appu", "kishan"), weight = c(63,
    48, 20, NA), height = c(164, 150, NA, 75))

Abundance

Community <- c(rep("A", 3), rep("B", 3))
Species <- rep(c("X", "Y", "Z"), 2)
Count <- c(100, 0, 50, 50, 30, 40)

df <- data.frame(Community, Species, Count)

# abundance refers to the total number of individuals of different species
# within each community. It represents the quantity or total count of
# individuals present.

abundance <- df %>%
    group_by(Community) %>%
    summarise(Total_abundance = sum(Count))

# Species richness: how many different species were actually recorded
# (Count > 0) in each community.

richness <- df %>%
    filter(Count > 0) %>%
    group_by(Community) %>%
    summarise(Richness = n_distinct(Species))

Proportion

tree <- c("a", "b", "c", "d")
treatment <- c("fire", "no_fire")

# Simulate 20 observations: a randomly drawn tree and treatment, plus a flower
# count drawn from a binomial(3, 0.3). TRUE is spelled out because T is an
# ordinary variable that can be reassigned.
data.frame(
    tree = sample(tree, 20, replace = TRUE),
    treatment = sample(treatment, 20, replace = TRUE),
    flower = rbinom(20, 3, prob = 0.3)
) %>%
    group_by(tree, treatment, flower) %>%
    summarise(count = n(), .groups = "drop") %>%
    # the grouping was dropped above, so prop is each combination's share of
    # all 20 observations
    mutate(prop = count / sum(count)) %>%
    ggplot(aes(x = flower, y = prop, fill = tree)) +
    geom_bar(stat = "identity", position = "dodge") +
    facet_wrap(~treatment)

Zero count

library(dplyr)

df <- data.frame(tree = c(rep("a", 4), rep("b", 4)), seeds = c(0, 0, 0, 1, 2, 3,
    0, 0))

zero_counts <- df %>%
    group_by(tree) %>%
    summarise(zero_count = sum(seeds == 0))

print(zero_counts)
## # A tibble: 2 × 2
##   tree  zero_count
##   <chr>      <int>
## 1 a              3
## 2 b              2

Data frame with unequal group sizes (10 and 8)

library(tidyverse)
# Two groups of unequal size: 10 simulated female scores and 8 male scores.
data <- data.frame(
    sex = c(rep("female", 10), rep("male", 8)),
    score = c(
        rnorm(n = 10, mean = 7.56, sd = 1.978),
        rnorm(n = 8, mean = 7.75, sd = 1.631)
    )
)

data %>%
    head(5)
# Rows per sex and each group's share of all rows, in percent. The count is
# called n (not score) so it cannot be mistaken for the score column.
data %>%
    group_by(sex) %>%
    summarise(n = n()) %>%
    mutate(freq = n / sum(n) * 100)

Name the unnamed first column of a data.frame

# newdf <- rownames_to_column(df, var = 'name to an unnamed')

Creating a tibble

library(tidyverse)
# tribble() reads like a table: the ~headers come first, then one record per
# line. The column name `Lenght` is spelled as the other chunks use it.
years <- tribble(
    ~Location,        ~Year, ~Month, ~Day, ~Lenght,
    "Sydney",          2000,      9,   15, 12.1213,
    "Athens",          2004,      8,   13, 12.1212,
    "Beijing",         2008,      8,    8,  13.212,
    "London",          2012,      7,   27, 13.1212,
    "Rio de Janeiro",  2016,      8,    5,      65
)

# write.csv(years, file = 'years.csv', row.names = FALSE) # without index use
# row.names = FALSE

tabyl

tabyl

mutate round

# run previous code chunk
library(gt)
years %>%
    gt()
Location Year Month Day Lenght
Sydney 2000 9 15 12.1213
Athens 2004 8 13 12.1212
Beijing 2008 8 8 13.2120
London 2012 7 27 13.1212
Rio de Janeiro 2016 8 5 65.0000
# Round Lenght to two decimals, then render the table with gt and style it.
# (The stray trailing comma inside tab_options(), which passed an empty
# argument, has been removed.)
years %>%
    mutate(Lenght = round(Lenght, 2)) %>%
    gt() %>%
    tab_options(
        column_labels.font.size = 11,
        column_labels.font.weight = "bold",
        table.font.size = 10
    ) %>%
    opt_table_outline(style = "solid", width = px(2))
Location Year Month Day Lenght
Sydney 2000 9 15 12.12
Athens 2004 8 13 12.12
Beijing 2008 8 8 13.21
London 2012 7 27 13.12
Rio de Janeiro 2016 8 5 65.00
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# HairEyeColor is a contingency table; data.frame() turns it into one row per
# Hair/Eye/Sex combination with the number of people stored in `Freq`.
data <- data.frame(HairEyeColor)

data %>%
    # tabyl() counts rows, not `Freq`. Expand every combination into `Freq`
    # rows first; otherwise each cell is just 2 (one row per sex).
    tidyr::uncount(Freq) %>%
    tabyl(Hair, Eye) %>%
    adorn_percentages("row") %>%
    adorn_pct_formatting(digits = 2) %>%
    adorn_ns() %>%
    knitr::kable()
Hair Brown Blue Hazel Green
Black 62.96% (68) 18.52% (20) 13.89% (15) 4.63% (5)
Brown 41.61% (119) 29.37% (84) 18.88% (54) 10.14% (29)
Red 36.62% (26) 23.94% (17) 19.72% (14) 19.72% (14)
Blond 5.51% (7) 74.02% (94) 7.87% (10) 12.60% (16)

Data cleaning

Find NAs

# Locate the NAs. family is a data frame, so the positions reported by which()
# count cells column by column: with 4 rows, 8 is row 4 of column 2 (weight)
# and 11 is row 3 of column 3 (height). They are not row numbers.
which(is.na(family))
## [1]  8 11
# number of NAs in each column
colSums(is.na(family))
##   name weight height 
##      0      1      1

Replace na

mat <- matrix(sample(c(NA, 1:5), 50, replace = TRUE), 5)
df <- as.data.frame(mat)
df %>%
    replace(is.na(.), 0) %>%
    View()

Drop na

see spread & gather

Clean names

# install.packages('janitor')
library(janitor)

id <- (c(1, 1, 2, 2, 3, 3))
Country <- c("Angola", "Angola", "Botswana", "Botswana", "Zimbabwe", "Zimbabwe")
year <- c("2006", "2007", "2008", "2009", "2010", "2006")
bank.ratio <- c(24, 25, 38, 34, 42, 49)
Reserve.ratio <- c(77, 59, 64, 65, 57, 86)
broad.money <- c(163, 188, 317, 361, 150, 288)


bank <- data.frame(id, Country, year, bank.ratio, Reserve.ratio, broad.money)

bank <- bank %>%
    clean_names()  # replaced . with _

glimpse(bank)
## Rows: 6
## Columns: 6
## $ id            <dbl> 1, 1, 2, 2, 3, 3
## $ country       <chr> "Angola", "Angola", "Botswana", "Botswana", "Zimbabwe", …
## $ year          <chr> "2006", "2007", "2008", "2009", "2010", "2006"
## $ bank_ratio    <dbl> 24, 25, 38, 34, 42, 49
## $ reserve_ratio <dbl> 77, 59, 64, 65, 57, 86
## $ broad_money   <dbl> 163, 188, 317, 361, 150, 288

Filter

filter bank data frame below such that it retains a country if a given id is satisfied e.g. filtering a data frame that has countries with id 1 and 2 only

bank %>%
    filter(id %in% c(1, 2)) %>%
    as_tibble()

summarise fund available with each countries

bank %>%
    group_by(country) %>%
    summarise(fund = sum(broad_money)) %>%
    as_tibble()

Rename column

column: new name= old name

iris %>%
    rename(S.len = Sepal.Length, Sp. = Species) %>%
    head(3)

Rename to lower

iris %>%
    rename_with(tolower) %>%
    head(3)

Rename to lower specific columns

# Lower-case only the chosen column names; every other column is kept as it is.
# (select_at() would have dropped all columns that were not listed.)
iris %>%
    rename_with(tolower, .cols = c(Species, Petal.Length)) %>%
    head(3)

Add name to a nameless column

library(tidyverse)
# as_tibble() moves the row names into a real column called "cars". The result
# gets its own name so the built-in mtcars, and the row names that other chunks
# read from it, are not overwritten.
mtcars_tbl <- mtcars %>%
    as_tibble(rownames = "cars")

Add column

library(tibble)
# Add a running observation number. It is derived from the number of rows
# instead of the hard-coded 1:150, so it keeps working if the data change size.
iris %>%
    add_column(ob_no = seq_len(nrow(iris))) %>%
    head(5)
iris %>%
    as_tibble() %>%
    head(3)
library(gapminder)
summary(gapminder)
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 
str(gapminder)
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...

Re-code observation (recode)

change name of observation— mutate (variable=recode (variable, ‘old name’=‘new name’)))

gapminder %>%
    mutate(country = recode(country, India = "IND")) %>%
    filter(country == "IND") %>%
    head(3)

Convert numeric values to a binary (Yes/No)

To convert all non-zero numeric values to “Yes” to convert zero values to “No”

df <- data.frame(name = c("saneesh", "sanusha", "appu", "jaru"), sex = c(2, 0, 5,
    8))
df
# convert numeric values to 'Yes'
df %>%
    mutate(sex1 = ifelse(sex != 0, "Yes", "No"))
df %>%
    mutate(sex1 = ifelse(sex != 0, "Male", "Female"))

The ifelse() function is used to check whether each value in the “sex” column is non-zero. If it is, the value is replaced with “Yes”. If not, the value is replaced with “No”.

Select

gapminder %>%
    select(year, country, gdpPercap) %>%
    head(3)
msleep %>%
    select(starts_with("sleep")) %>%
    head(3)

Do not select

iris %>%
    select(-Sepal.Length, -Species) %>%
    head(3)
iris %>%
    select(-c(Sepal.Length)) %>%
    head(3)
iris %>%
    select(!Sepal.Length) %>%
    head(3)

ends_with

iris %>%
    select(ends_with("length")) %>%
    head(3)

starts_with

iris %>%
    select(starts_with("Sepal")) %>%
    head(3)

Filter

# keep three columns, then only the rows for Eritrea after 1950
gapminder %>%
    select(year, country, lifeExp) %>%
    filter(country == "Eritrea", year > 1950) %>%
    head(3)
gapminder %>%
    filter(country == "Canada") %>%
    head(3)  # from gapminder data keep country Canada and show only the first 3 observations

Except

gapminder %>%
    filter(country != "Oman") %>%
    head(3)  # from gapminder data filter all the other countries except Oman

Omit

iris %>%
    filter(Species != "setosa") %>%
    glimpse()
## Rows: 100
## Columns: 5
## $ Sepal.Length <dbl> 7.0, 6.4, 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5.0, 5.…
## $ Sepal.Width  <dbl> 3.2, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2.0, 3.…
## $ Petal.Length <dbl> 4.7, 4.5, 4.9, 4.0, 4.6, 4.5, 4.7, 3.3, 4.6, 3.9, 3.5, 4.…
## $ Petal.Width  <dbl> 1.4, 1.5, 1.5, 1.3, 1.5, 1.3, 1.6, 1.0, 1.3, 1.4, 1.0, 1.…
## $ Species      <fct> versicolor, versicolor, versicolor, versicolor, versicolo…

Filter multiple

iris %>%
    select(Species) %>%
    distinct(Species) %>%
    filter(Species %in% c("setosa", "versicolor")) %>%
    head(3)

using a vector, save the names as a vector and give it to %in%

# store the countries of interest in a vector and hand it to %in%
target <- c("Hungary", "Iceland", "Mongolia")
gapminder %>%
    filter(country %in% target) %>%
    head(3)
friends <- data.frame(Names = c("Saneesh", "Appu", "Shruti", "Aradhana", "Arathi",
    "James Bond"), age = c(40, 9, 25, 25, 25, 50))
# The data frame friends has the columns Names and age. Names holds 'Saneesh',
# 'Appu', 'Shruti', 'Aradhana', 'Arathi' and 'James Bond'. We want only the
# rows for Appu and James Bond, so we put those two names in a vector.

target <- c("Appu", "James Bond")  # and then

friends %>%
    filter(Names %in% target)
# or
friends %>%
    filter(Names == "Appu" | Names == "James Bond")
# or
friends %>%
    filter(Names %in% c("Appu", "James Bond"))

omit multiple

iris %>%
    filter(!Species %in% c("setosa", "versicolor")) %>%
    glimpse()
## Rows: 50
## Columns: 5
## $ Sepal.Length <dbl> 6.3, 5.8, 7.1, 6.3, 6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.…
## $ Sepal.Width  <dbl> 3.3, 2.7, 3.0, 2.9, 3.0, 3.0, 2.5, 2.9, 2.5, 3.6, 3.2, 2.…
## $ Petal.Length <dbl> 6.0, 5.1, 5.9, 5.6, 5.8, 6.6, 4.5, 6.3, 5.8, 6.1, 5.1, 5.…
## $ Petal.Width  <dbl> 2.5, 1.9, 2.1, 1.8, 2.2, 2.1, 1.7, 1.8, 1.8, 2.5, 2.0, 1.…
## $ Species      <fct> virginica, virginica, virginica, virginica, virginica, vi…

filter between

# between() includes both end points: 2 <= Petal.Width <= 5
iris %>%
    filter(between(Petal.Width, 2, 5)) %>%
    glimpse()
## Rows: 29
## Columns: 5
## $ Sepal.Length <dbl> 6.3, 7.1, 6.5, 7.6, 7.2, 6.5, 6.8, 5.7, 5.8, 6.4, 7.7, 7.…
## $ Sepal.Width  <dbl> 3.3, 3.0, 3.0, 3.0, 3.6, 3.2, 3.0, 2.5, 2.8, 3.2, 3.8, 2.…
## $ Petal.Length <dbl> 6.0, 5.9, 5.8, 6.6, 6.1, 5.1, 5.5, 5.0, 5.1, 5.3, 6.7, 6.…
## $ Petal.Width  <dbl> 2.5, 2.1, 2.2, 2.1, 2.5, 2.0, 2.1, 2.0, 2.4, 2.3, 2.2, 2.…
## $ Species      <fct> virginica, virginica, virginica, virginica, virginica, vi…

filter matching

library(tidyverse)
library(dplyr)
# Start from the packaged dataset: the car names must still be row names for
# rownames_to_column() to produce them, and `mtcars` may already have been
# replaced by a tibble (which has no row names) earlier in the session.
mtcars <- datasets::mtcars %>%
    rownames_to_column()
mtcars %>%
    filter(str_detect(rowname, "Merc")) %>%
    head(3)  # keep only the cars whose name contains 'Merc'
mtcars %>%
    filter(!str_detect(rowname, "Merc")) %>%
    head(3)  # keep everything except 'Merc'

filter distinct

To remove every name that has a 1 anywhere in the “pref” column of your data frame, group the rows by name and use the filter() function from the dplyr package (distinct() is not needed for this):

df <- data.frame(name = c("a", "a", "b", "c", "d", "a", "d"), pref = c(1, 2, 2, 1,
    3, 4, 1))

df
df %>%
    group_by(name) %>%
    filter(!any(pref == 1)) %>%
    ungroup()

or, if you have multiple rows with the same name but different values in the “pref” column, the code above will remove all rows with that name if any of them have 1 in the “pref” column. If you want to remove only the rows with 1 in the “pref” column, but keep the other rows with the same name, you can modify the code as follows:

# Row-level filter: drop only the rows where pref is 1. Other rows that share
# the same name are kept, so no grouping is needed.
df %>%
    filter(pref != 1)

Pull

iris %>%
    pull(Species) %>%
    head(3)  # returns vector values
## [1] setosa setosa setosa
## Levels: setosa versicolor virginica
iris %>%
    select(Species) %>%
    head(3)  # returns a table with one column
iris %>%
    select(everything()) %>%
    head(3)

multiple conditions

# conditions separated by commas must all hold (same as joining them with &)
gapminder %>%
    filter(country == "Oman", year > 1980, year <= 2000) %>%
    head(4)
# %in% replaces a chain of country == ... | country == ...
gapminder %>%
    select(country, year) %>%
    filter(year >= 1980, country %in% c("India", "Oman", "Canada")) %>%
    head(4)
# from gapminder data keep all the other countries except Oman
gapminder %>%
    filter(country != "Oman") %>%
    head(3)

drop

gapminder %>%
    select(-year, -pop) %>%
    head(5)

group by & summarise

# Three countries with the highest life expectancy in 2007.
# arrange() has no `decreasing` argument (it would be treated as one more sort
# key and the order would stay ascending); descending order needs desc().
gapminder %>%
    filter(year == 2007) %>%
    group_by(country) %>%
    summarise(meanLE = mean(lifeExp)) %>%
    arrange(desc(meanLE)) %>%
    head(3)
# Three countries with the lowest life expectancy ever recorded; arrange()
# sorts in ascending order by default.
gapminder %>%
    group_by(country) %>%
    summarise(minLE = min(lifeExp)) %>%
    arrange(minLE) %>%
    head(3)

Group by continent, then summarise two things: first n = n(), the number of rows for each continent (the size of each group), and then the mean of the lifeExp variable.

gapminder %>%
    group_by(continent) %>%
    summarise(n = n(), meanLife = mean(lifeExp))
gapminder %>%
    group_by(continent) %>%
    summarise(PopConti = sum(pop))
pets <- data.frame(
    names = c(rep("saneesh", 3), rep("appu", 2), "sanusha"),
    pet = c(rep("dog", 3), rep("cat", 2), "tiger"),
    number = c(2, 2, 5, 7, 8, 1),
    size = c(rep("medium", 2), rep("small", 3), "big")
)

pets
# Total number of animals for every pet/size combination. .groups = "drop"
# returns an ungrouped result and avoids the message summarise() prints when
# it has to pick the output grouping by itself.
pets %>%
    group_by(pet, size) %>%
    summarise(totalpet = sum(number), .groups = "drop")

grouping with conditions

If we want make a ‘new column’ with values from ‘number’ only if ‘sp.name’ ‘a’ or any other values has the following responses ‘young’ and ‘adult’, if not enter 0 in the ‘new column’.

You need to have groups with any of stage == “young” & “adult” (group level conditions) and stage == “adult” (row-level condition):

summarise

library(tidyverse)
plot <- c(rep(1, 2), rep(2, 4), rep(3, 3))
bird <- c("a", "b", "a", "b", "c", "d", "a", "b", "c")
area <- c(rep(10, 2), rep(5, 4), rep(15, 3))

birdlist <- data.frame(plot, bird, area)
birdlist
# summarize the following data frame to a summary table.  option 1
birdlist %>%
    group_by(plot) %>%
    summarise(bird = n(), area = unique(area))
# option 2
birdlist %>%
    count(plot, area, name = "bird")
# one value for the whole table
gapminder %>%
    summarise(mean(lifeExp))
# range() returns two values (minimum and maximum). summarise() is meant for
# one row per group and warns otherwise; reframe() is the verb for summaries
# that return any number of rows.
gapminder %>%
    reframe(range(lifeExp))
gapminder %>%
    filter(country == "India") %>%
    group_by(country) %>%
    summarise(GDPmax = max(gdpPercap), GDPmin = min(gdpPercap), GDPmean = mean(gdpPercap))

remove duplicates from a column and summarise

df <- data.frame(name = c("a", "a", "b", "c"), seedling = c(1, 0, 1, 0), adult = c(0,
    5, 0, 1))

df_new <- df %>%
    group_by(name) %>%
    summarise(seedling = max(seedling, 0), adult = max(adult, 0)) %>%
    ungroup()

find and remove duplicates from a dataframe

library(dplyr)
library(hablar)
## 
## Attaching package: 'hablar'
## The following object is masked from 'package:forcats':
## 
##     fct
## The following object is masked from 'package:dplyr':
## 
##     na_if
## The following object is masked from 'package:tibble':
## 
##     num
df <- tibble(a = c(1, 1, "a", 2, 2, 2, 4), b = c("a", "a", 1, "b", "b", "b", "c"))
df %>%
    print()
## # A tibble: 7 × 2
##   a     b    
##   <chr> <chr>
## 1 1     a    
## 2 1     a    
## 3 a     1    
## 4 2     b    
## 5 2     b    
## 6 2     b    
## 7 4     c
df %>%
    find_duplicates()
df %>%
    distinct() %>%
    print()
## # A tibble: 4 × 2
##   a     b    
##   <chr> <chr>
## 1 1     a    
## 2 a     1    
## 3 2     b    
## 4 4     c

count/summarize

count name column

iris %>%
    count(Species, name = "how many")
mtcars %>%
    count(am, name = "number") %>%
    as_tibble()
mtcars %>%
    count(gear, name = "no. gear")

New column with paste

library(dplyr)

# Create a data frame with two columns named 'a' and 'b'
df <- data.frame(a = c("red", "blue", "green"), b = c(1, 2, 3))

# Create a new column named 'c' by combining values from 'a' and 'b'
df <- df %>%
    mutate(c = paste(a, b, sep = "_"))

Count birds

plot <- c(rep(1, 2), rep(2, 4), rep(3, 3))
bird <- as.factor(c("a", "b", "a", "b", "c", "d", "a", "b", "c"))
area <- c(rep(10, 2), rep(5, 4), rep(15, 3))

birdlist <- data.frame(plot, bird, area)
birdlist
# birdlist %>%  group_by(plot, area) %>%  mutate(count(bird))


birdlist %>%
  group_by(plot, area) %>%
  summarise(bird = n(), .groups = "drop")
# (dplyr::summarise)like this
# to summarize of a column with reference to two other variables.

count sites

treatment <- c(rep("ab", 2), rep("bgrnf", 8), rep("bgpnf", 4))
site <- c(
  "ab1",
  "ab2",
  rep("bgrnf1", 3),
  rep("bgrnf2", 2),
  "bgrnf3",
  "bgrnf4",
  "bgrnf5",
  rep("bgpnf1", 2),
  rep("bgpnf2", 2)
)
data <- data.frame(treatment, site)

# Number of different sites in each treatment. A site can occupy several rows,
# so the rows must not simply be counted: n_distinct() counts each site once.
data %>%
  group_by(treatment) %>%
  summarise(`#sites` = n_distinct(site))

count within years

# One fire record per site and year: four sites (a-d), each observed in 2000,
# 2001 and 2002. The rows are ordered site by site, so the years cycle within
# each site and the three `fire` values on a line belong to one site.
year <- rep(c(2000, 2001, 2002), times = 4)
site <- rep(c("a", "b", "c", "d"), each = 3)

fire <- c("yes", "no", "yes",
  "yes", "no", "no",
  "yes", "yes", "yes",
  "yes", "yes", "yes")

df <- data.frame(year, site, fire)

# Step 1: count the "yes" records of every site within 2000-2002.
# Step 2: count how many sites burnt exactly once, twice and three times
#         (each matching site contributes 1, every other site 0).
df %>%
  group_by(site) %>%
  summarise(fires = sum(fire == "yes" & year %in% c(2000, 2001, 2002))) %>%
  summarise(
    Burnt_once = sum(ifelse(fires == 1, 1, 0), na.rm = TRUE),
    Burnt_twice = sum(ifelse(fires == 2, 1, 0), na.rm = TRUE),
    Burnt_thrice = sum(ifelse(fires == 3, 1, 0), na.rm = TRUE)
  )
# df %>%
#   group_by(site) %>%
#   summarize(
#     Burnt_once = sum(fire == "yes" &
#                        year %in% c(2000, 2001, 2002)) == 1, # in these years look for 1 'yes'
#     Burnt_twice = sum(fire == "yes" &
#                         year %in% c(2000, 2001, 2002)) == 2, # in these years look for 2 'yes'
#     Burnt_thrice = sum(fire == "yes" &
#                          year %in% c(2000, 2001, 2002)) == 3 # in these years look for 3 'yes'
#   ) %>% # returns a logical vector
#   mutate(
#     Burnt_once = ifelse(Burnt_once, 1, 0),
#     Burnt_twice = ifelse(Burnt_twice, 1, 0),
#     Burnt_thrice = ifelse(Burnt_thrice, 1, 0)
#   ) %>% # convert logical response to numeric
#   summarise( # summarise data
#     across( # specifycolumns
#       where(is.numeric), # select columns with numeric ones
#       ~ sum( # selected column using the ~ formula notation
#         .x, # for each selected columns
#         na.rm = TRUE))) # remove any missing values before calculating the sum

case when new column

library(dplyr)
library(stringr)
feedback <-
  c("good_book", "good_read", "for knowledge", "adventure")
book <- c("Ramayana", "Bible", "Encyclopedia", "Mbharatha")

df <- data.frame(book, feedback)

df %>%
  mutate(response = case_when(str_starts(feedback, "good") ~ "good")) %>%
  select(book, response) %>% as_tibble()

Case when

names(iris)
## [1] "Sepal.Length" "Sepal.Width"  "Petal.Length" "Petal.Width"  "Species"
iris %>%
    mutate(species.code = case_when(Species == "setosa" ~ 1, Species == "versicolor" ~
        2, Species == "virginica" ~ 3)) %>%
    head()

Use of if else

library(dplyr)

iris %>%
    select(Species) %>%
    slice_sample(n = 10) %>%
    mutate(code = if_else(Species == "setosa", 1, 0)  # you might see different result!
)

Separate text to columns

df <- data.frame(films = c("Spider_man", "James_bond", "Iron_man", "Bat_man"))
df
df1 <- df %>%
    separate(films, c("a", "b"), sep = "([_])")
df1

Unite text

df1 %>%
    unite("names", a:b, remove = FALSE)

Join

df1 <- data.frame(id = c(1:4), films = c("Spider_man", "James_bond", "Iron_man",
    "Bat_man"))

df2 <- data.frame(id = c(1:4), country = rep("us", 4))
df3 <- left_join(df1, df2, by = "id")

Spread & gather

We are making a wide format from long format in the first example. The second example is to make a long format from wide.

# classdata is already in long format: one row per student with the subject
# and the grade in their own columns
classdata <- data.frame(
  studentname = c("captian", "ant", "james", "spider", "tony", "bat", "wonder"),
  subject = c("math", "his", "math", "geo", "his", "geo", "math"),
  grade = c("A+", "B", "B", "A+", "C", "B+", "C")
)

classdata %>% head()
# long -> wide. Note: spread() and gather() are superseded in tidyr;
# pivot_wider() and pivot_longer() are the current equivalents.
wide.class <- spread(classdata,  subject,  grade)
# classdata = name of the data frame
# subject = column whose values become the new column names
# grade = column whose values fill the new columns


head(wide.class)
# wide -> long; drop_na() removes the student/subject pairs without a grade
gather(wide.class, subject,  grade, geo, his, math) %>%
  drop_na()
# wide.class = name of the data frame
# subject = name of the new column that receives the old column names
# grade = name of the new column that receives the values
# geo, his, math = the columns the values are gathered from

Join rows

bind rows

df1 <-
  data.frame(
    id = c(1:4),
    films = c("Spider_man", "James_bond", "Iron_man", "Bat_man")
  )
df2 <-
  data.frame(
    id = c(5:8),
    films = c("King Cong", "Silence of the lambs", "Intersteller", "Gravity")
  )
dplyr::bind_rows(df1, df2)

Across

For multiple variables

library(tidyverse)
srno <- c(1:2)
film <- c("arabica", "robust")
rate <- c("good", "better")
lang_Eng <- c("yes", "yes")

films <- data.frame(srno, film, rate, lang_Eng)

str(films)
## 'data.frame':    2 obs. of  4 variables:
##  $ srno    : int  1 2
##  $ film    : chr  "arabica" "robust"
##  $ rate    : chr  "good" "better"
##  $ lang_Eng: chr  "yes" "yes"
films <- films %>%
  mutate(across(c(rate, lang_Eng), as.factor))

str(films)
## 'data.frame':    2 obs. of  4 variables:
##  $ srno    : int  1 2
##  $ film    : chr  "arabica" "robust"
##  $ rate    : Factor w/ 2 levels "better","good": 2 1
##  $ lang_Eng: Factor w/ 1 level "yes": 1 1

Everything

Select a key variable and everything or every other columns.

library(gapminder)
gapminder %>%
    select(pop, everything()) %>%
    head(3)

toupper and lower

library(stringr)

data <- data.frame(Dose.Cm = c("d1", "D2", "D3"), Len.km = c("High", "low", "Low"))
glimpse(data)
## Rows: 3
## Columns: 2
## $ Dose.Cm <chr> "d1", "D2", "D3"
## $ Len.km  <chr> "High", "low", "Low"
data %>%
    mutate(Dose.Cm = tolower(Dose.Cm), Len.km = toupper(Len.km))

factor

data <- data.frame(Dose.Cm = c("d1", "D2", "D3"), Len.km = c("high", "low", "medium"))
data <- data %>%
    mutate(len = as.factor(Len.km))

glimpse(data)
## Rows: 3
## Columns: 3
## $ Dose.Cm <chr> "d1", "D2", "D3"
## $ Len.km  <chr> "high", "low", "medium"
## $ len     <fct> high, low, medium

change order of factor

data %>%
    mutate(len = fct_relevel(len, c("low", "medium", "high")))

parse_number

This drops any non-numeric characters before or after the first number. The grouping mark specified by the locale is ignored inside the number.

library(tidyverse)
# A small table in which the number of students is given as a text range
class <- c("8th", "9th", "10th")
students <- c("25-30", "35-41", "21-28")
school <- data.frame(class, students)
school
glimpse(school)  # notice that students is a binned range stored as text, not a number
## Rows: 3
## Columns: 2
## $ class    <chr> "8th", "9th", "10th"
## $ students <chr> "25-30", "35-41", "21-28"
school %>%
    mutate(students = parse_number(students)) %>%
    glimpse()
## Rows: 3
## Columns: 2
## $ class    <chr> "8th", "9th", "10th"
## $ students <dbl> 25, 35, 21
school %>%
    mutate(students = parse_number(students))
# students is now numeric: parse_number() kept only the first number of each
# range (25, 35, 21)

pivot longer

library(tidyverse)

rawdata <- data.frame(species_1 = rnorm(n = 40, mean = 300, sd = 18.5), species_2 = rnorm(40,
    305, 16.7))
data <- pivot_longer(data = rawdata, cols = species_1:species_2, names_to = "species",
    values_to = "weight")

Pivot wider

library(tidyverse)

df <- data.frame(name = c("saneesh", "sanusha", "appu", "jaru"), fav.no = c(11, 7,
    20, 21), animal = c("human", "human", "human", "dog"))

df %>%
    pivot_wider(names_from = "animal", values_from = "fav.no")
# but when we have similar names in the grouping column
df1 <- data.frame(name = c("saneesh", "sanusha", "appu", "jaru", "saneesh"), fav.no = c(11,
    7, 20, 21, 12), animal = c("human", "human", "human", "dog", "human"))

df1 %>%
    pivot_wider(names_from = "animal", values_from = "fav.no")
## Warning: Values from `fav.no` are not uniquely identified; output will contain
## list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
##   {data} |>
##   dplyr::summarise(n = dplyr::n(), .by = c(name, animal)) |>
##   dplyr::filter(n > 1L)
# because saneesh is repeated twice but with two fav.nos the solution is to add
# a row id, make pivot wide and get rid of the row id
df1 %>%
    mutate(id = row_number()) %>%
    group_by(name) %>%
    pivot_wider(names_from = "animal", values_from = "fav.no", values_fill = 0) %>%
    select(-id)

Scoring numbers to likert

library(tidyverse)
# Split 1..10 into three equally filled bins, keep the bin number as a factor
# (test1) and give the bins readable labels (test2).
numbers <- data.frame(test = 1:10) %>%
  mutate(
    test1 = as.factor(as.numeric(cut_number(test, 3))),
    test2 = recode(test1, "1" = "low", "2" = "medium", "3" = "high")
  )

ggplot

sthda

add border to points

library(ggplot2)
# shape 21 is a circle with a separate outline: `fill` colours the inside,
# `color` and `stroke` draw the border.
# An alpha passed to ggplot() itself is silently ignored, so it has been
# removed; for transparent points set it in the layer, e.g.
# geom_point(..., alpha = 0.7).
ggplot(iris, aes(x = Petal.Length, y = Petal.Width, fill = Species)) +
    geom_point(size = 4, shape = 21, color = "black", stroke = 1.5)

df <- data.frame(dose = c("D0.5", "D1", "D2"), len = c(4.2, 10, 29.5))

bar plot

library(ggplot2)
# Basic barplot: stat = "identity" draws the len values as they are instead of
# counting rows
p <- ggplot(data = df, aes(x = dose, y = len)) + geom_bar(stat = "identity")
p

# Horizontal bar plot:
# p + coord_flip()
# Change the width of bars
ggplot(data = df, aes(x = dose, y = len)) + geom_bar(stat = "identity", width = 0.5)

# Change colors: blue outline, white inside
ggplot(data = df, aes(x = dose, y = len)) + geom_bar(stat = "identity", color = "blue",
    fill = "white")

# Minimal theme + blue fill color
p <- ggplot(data = df, aes(x = dose, y = len)) + geom_bar(stat = "identity", fill = "steelblue") +
    theme_minimal()
p

labels

# value labels outside (above) the bars
p + geom_text(aes(label = len), vjust = -0.3, size = 3.5) + theme_minimal()

# value labels inside the bars, in white
p + geom_text(aes(label = len), vjust = 1.6, color = "white", size = 3.5) + theme_minimal()

geom_vline

df <- data.frame(dose = c("D0.5", "D1", "D2", "pp", "kk", "rr"), len = c(4.2, 10,
    29.5, 12, 15, 23))
library(ggplot2)

ggplot(df, aes(len)) + geom_density() + geom_vline(aes(xintercept = mean(len)), col = "red",
    linetype = "dashed")

scatter plot with lm

library(ggplot2)

ggplot(iris, aes(Petal.Length, Petal.Width)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

raincloud plot

library(ggdist)
library(tidyverse)
library(tidyquant)
## Loading required package: PerformanceAnalytics
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
## Loading required package: quantmod
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Raincloud plot of highway mileage by number of cylinders: a half violin
# (the cloud), a narrow box plot and the raw observations as dots (the rain).
mpg %>% filter(cyl %in% c(4, 6, 8)) %>%
  ggplot(aes(
    x = factor(cyl),
    y = hwy,
    fill = factor(cyl)
  )) +
  # add half violin from `ggdist` package
  ggdist::stat_halfeye(
    # custom bandwidth
    adjust = 0.5,
    # move geom to right
    justification = -0.2,
    # remove slab interval
    .width = 0,
    point_color = NA
  ) +
  # add boxplot
  geom_boxplot(width = 0.12,
    # remove outliers
    outlier.colour = NA,
    alpha = 0.5) +
  # add dot plots from `ggdist` package
  ggdist::stat_dots(
    # draw the dots on the left side of the box plot
    side = "left",
    # move geom to the left
    justification = 1.1,
    # adjust grouping of observation
    binwidth = 0.25) +
  # adjust theme
  scale_fill_tq() +
  theme_tq() +
  # Axis titles follow the aesthetics, not the screen position: x is mapped to
  # the cylinders and y to hwy, and coord_flip() only changes where each axis
  # is drawn.
  labs(
    title = "raincloud plot",
    subtitle = "showing bimodel distribution of 6 cylinder  vehicles",
    x = "cylinders",
    y = "highway fuel efficiency"
  ) +
  coord_flip()

hex plot

library(tidyverse)
# install.packages("hexbin")
# Toy data: eight locations for one class, each with a textual student-count
# range and a coordinate pair.
class <- rep("10th", 8)
students <- c(
  "10 to 15",
  "15-20",
  "17 to 24",
  "20  to 25",
  "25 to 30",
  "30 to 40",
  "45 to 47",
  "50 to 55"
)
latitude <- c(
  11.50897246, 11.48323136, 11.48719031, 11.46366611,
  11.41097322, 11.52111154, 11.44491386, 11.46569568
)
longitude <- c(
  76.06032062, 76.06192685, 76.04266851, 76.04156575,
  76.05075092, 76.02846331, 76.03084141, 76.01766216
)
school <- data.frame(class, students, latitude, longitude)

# parse_number() turns each range string into a single number before plotting.
# Longitude is mapped to x and latitude to y so the panel is oriented like a
# map (east-west horizontal, north-south vertical).
# stat_summary_hex() needs the "hexbin" package (see the install line above).
school %>%
  mutate(students = parse_number(students)) %>%
  ggplot(aes(x = longitude, y = latitude, z = students)) +
  stat_summary_hex() +
  scale_fill_viridis_c(alpha = 0.8) +
  labs(fill = "students", title = "school students")
## Warning: Computation failed in `stat_summary_hex()`
## Caused by error in `compute_group()`:
## ! The package "hexbin" is required for `stat_summary_hex()`

Subscript and superscript

# Axis titles written as plotmath expressions:
# `a[b]` renders b as a subscript, `a^b` renders b as a superscript.
ggplot(iris, aes(x = Species, y = Sepal.Length)) +
  geom_boxplot() +
  labs(
    x = expression(text[subscript]),
    y = expression(text^superscript)
  )

Two subtitles in two different positions in ggplot2

library(ggplot2)
# The argument is `warn.conflicts`; `warn =` only worked through partial
# argument matching.
library(dplyr, warn.conflicts = FALSE)

# One panel per species; the facet strips are left-aligned and drawn without a
# background so that they read like per-panel subtitles.
iris %>%
  filter(Species != "setosa") %>%
  ggplot(aes(x = Petal.Length, y = Petal.Width)) +
  geom_point() +
  facet_wrap(~Species) +
  theme(
    strip.background.x = element_blank(),
    strip.text.x = element_text(hjust = 0, size = 11)
  )

stat summary

# Simulated incomes: 20 households per village, both drawn from N(1000, 150).
income.data <- data.frame(
  Village = rep(c("Chittor", "Bellari"), each = 20),
  Income = c(
    rnorm(n = 20, mean = 1000, sd = 150),
    rnorm(n = 20, mean = 1000, sd = 150)
  )
)
library(ggplot2)
# Boxplot per village with the group mean overlaid as a red point.
ggplot(income.data, aes(Village, Income)) +
  geom_boxplot() +
  stat_summary(geom = "point", fun = mean, col = "red")

geom_density

# Simulated incomes: 20 households per village, both drawn from N(1000, 150).
income.data <- data.frame(
  Village = c(rep("Chittor", 20), rep("Bellari", 20)),
  Income = c(
    rnorm(n = 20, mean = 1000, sd = 150),
    rnorm(n = 20, mean = 1000, sd = 150)
  )
)
library(ggplot2)

# Per-village means are computed from the data. The incomes are random, so
# hard-coded positions would not match the next run.
village_means <- aggregate(Income ~ Village, data = income.data, FUN = mean)

ggplot(income.data) +
  # overall mean (dashed)
  geom_vline(aes(xintercept = mean(Income)), linetype = "dashed") +
  geom_density(aes(x = Income, color = Village)) +
  # village means (dotted), coloured by the same scale as the density curves
  geom_vline(
    data = village_means,
    aes(xintercept = Income, color = Village),
    linetype = "dotted"
  )

reorder axis

library(tidyverse)
# Using median: order the vehicle classes by their median highway mileage.
# `class` is mapped directly; wrapping it in reorder(class, hwy) inside aes()
# would re-order by the mean (reorder()'s default) and undo the median order.
mpg %>%
  mutate(class = fct_reorder(class, hwy, .fun = median)) %>%
  ggplot(aes(x = class, y = hwy, fill = class)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  xlab("")

pie chart

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Number of species per plant family.
data <- data.frame(
  category = c(
    "Poaceae", "Fabaceae", "Asteraceae", "Acanthaceae",
    "Rubiaceae", "Euphorbiaceae", "Others"
  ),
  count = c(18, 15, 8, 4, 4, 3, 17)
)

# Donut chart: a pie trace whose centre is cut out with `hole`.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
fig <- data %>%
  plot_ly(labels = ~category, values = ~count) %>%
  add_pie(hole = 0.4) %>%
  layout(title = "Donut charts using Plotly", showlegend = TRUE)

fig

barplot with error bar

# create dummy data: five bars with a random height and a fixed sd each
data <- data.frame(
  name = letters[1:5],
  value = sample(seq(4, 15), 5),
  sd = c(1, 0.2, 3, 2, 4)
)

# Most basic error bar
library(viridis)
## Loading required package: viridisLite
# Bars show `value`; the error bars span value - sd to value + sd.
# The fill is a constant ("skyblue"), not a mapped aesthetic.
ggplot(data) +
  geom_col(aes(x = name, y = value), fill = "skyblue", alpha = 0.7) +
  scale_fill_viridis_d() +
  geom_errorbar(
    aes(x = name, ymin = value - sd, ymax = value + sd),
    width = 0.4,
    colour = "orange",
    alpha = 0.9,
    linewidth = 1.3
  )

annotate

library(tidyverse)
# Four height observations, laid out one row per line.
df <- tribble(
  ~gender,  ~height,
  "male",   12,
  "male",   8,
  "female", 11.5,
  "female", 11
)

# Scatter plot with one text label and two dashed segments that both end at
# the label position (x = 1.3, y = 11.3).
ggplot(df, aes(gender, height)) +
  geom_point() +
  annotate(
    "text",
    x = 1.29, y = 11.4,
    label = "short person",
    color = "red", size = 3, fontface = "italic"
  ) +
  # x/xend set the horizontal start and end of a segment, y/yend the vertical
  annotate(
    "segment",
    x = 1.05, xend = 1.3,
    y = 11.02, yend = 11.3,
    color = "blue", linetype = "dashed"
  ) +
  annotate(
    "segment",
    x = 1.95, xend = 1.3,
    y = 8.2, yend = 11.3,
    color = "blue", linetype = "dashed"
  )

months

library(lubridate)
# `month.abb` is base R's built-in vector of abbreviated month names
# ("Jan" ... "Dec"), so it is used directly instead of building the indices
# 1:12 and subsetting with them.
months <- month.abb
# one temperature value per month, January first
temperature <- c(10, 12, 22, 32, 35, 30, 33, 28, 29, 25, 19, 14)
myframe <- data.frame(months, temperature) # creating a new data frame

library(tidyverse)
glimpse(myframe)
## Rows: 12
## Columns: 2
## $ months      <chr> "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "S…
## $ temperature <dbl> 10, 12, 22, 32, 35, 30, 33, 28, 29, 25, 19, 14
library(ggplot2)
# Line plus points of temperature per month. `group = 1` joins all months into
# a single line although the x axis is discrete.
ggplot(myframe, aes(x = months, y = temperature, group = 1)) +
  geom_line(col = "blue") +
  geom_point(col = "red") +
  ggtitle("Temperature of months") +
  scale_x_discrete(limits = month.abb) # this will order months on the x axis

# create and view data frame
df <- data.frame(
  date = c("05/30/2021", "08/18/2021", "09/13/2021", "02/19/2021"),
  sales = c(3, 15, 14, 9)
)

# Parse the month/day/year strings into Date values ...
df <- mutate(df, date = as.Date(date, format = "%m/%d/%Y"))
# ... then sort the rows chronologically.
df <- arrange(df, date)
df
# Two supplements (supp) at three dose levels each.
df2 <- data.frame(
  supp = rep(c("VC", "OJ"), each = 3),
  dose = rep(c("D0.5", "D1", "D2"), 2),
  len = c(6.8, 15, 33, 4.2, 10, 29.5)
)
# Dodged bars with the value printed inside each bar.
# `p` is built BEFORE it is used below; using it first fails with
# "object 'p' not found" when the code is run top to bottom.
p <- ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  geom_text(
    aes(label = len),
    vjust = 1.6, color = "white",
    position = position_dodge(0.9), size = 3.5
  ) +
  scale_fill_brewer(palette = "Paired") +
  theme_minimal()

# Restrict the discrete x axis to two of the three dose levels.
# NOTE(review): the recorded warning below mentions position_stack(), so it
# was presumably captured with a different `p`; re-knit to refresh it.
p + scale_x_discrete(limits = c("D0.5", "D2"))
## Warning: Removed 1 rows containing missing values (`position_stack()`).
# Stacked barplot with multiple groups
ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_col()

# Use position=position_dodge()
ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_col(position = position_dodge())

# Change the colors manually
# `p` is the dodged chart with black bar outlines; the fill scales below are
# added to it.
p <- ggplot(data = df2, aes(x = dose, y = len, fill = supp)) +
  geom_col(color = "black", position = position_dodge()) +
  theme_minimal()
# Use custom colors
p + scale_fill_manual(values = c("#999999", "#E69F00"))

# Use brewer color palettes
p + scale_fill_brewer(palette = "Blues")

Color Palettes

libraries

# install.packages('MetBrewer')
library(MetBrewer)

Draw a scatter plot with GDP per capita on the x-axis and life expectancy (LE) on the y-axis. The numerical variable Population controls the size of each point.

# Base object shared by the colour examples: the 2007 gapminder rows plus axis
# and legend titles and the theme. It has no geom; each example adds its own.
plot <- gapminder %>%
  filter(year == 2007) %>%
  ggplot() +
  labs(
    x = "GDP per Capita",
    y = "Life Expectancy",
    color = "Population in millions",
    size = "Population in millions"
  ) +
  theme_minimal()

# Point size encodes the population in millions.
plot +
  geom_point(aes(x = gdpPercap, y = lifeExp, size = pop / 1e6))

To use color in the plot, assign the Population variable to the color aesthetic. Since no color scale is specified, ggplot2 chooses a default color spectrum for this numerical variable (shades of blue).

# Population (in millions) drives both point size and colour; ggplot2 picks
# the colour scale.
plot +
  geom_point(
    aes(x = gdpPercap, y = lifeExp, size = pop / 1e6, color = pop / 1e6)
  )

To control the color spectrum, we need to introduce a color scale. In the following plot, we provide a vector of hex color values. Choose this approach if you picked your colors from one of the websites mentioned above.

# Same plot, with the colour gradient built from a hand-picked vector of hex
# colours.
plot +
  geom_point(
    aes(x = gdpPercap, y = lifeExp, size = pop / 1e6, color = pop / 1e6)
  ) +
  scale_color_gradientn(
    colors = c("#003049", "#D62828", "#F77F00", "#FCBF49", "#EAE2B7")
  )

To apply one of the MetBrewer palettes, replace the hex-vector with a MetBrewer function. Within the function call, you provide the palette’s name, then several colors, and tell it that we need a continuous palette since it is a numerical variable.

# Same plot, with the gradient taken from the MetBrewer "Cross" palette,
# requested as 500 colours of the continuous type.
plot +
  geom_point(
    aes(x = gdpPercap, y = lifeExp, size = pop / 1e6, color = pop / 1e6)
  ) +
  scale_color_gradientn(
    colors = met.brewer("Cross", n = 500, type = "continuous")
  )

You might also want to use color palettes with non-numerical variables. Let us assume we want to apply color to the Continent variable. This implies using a manual color scale and providing a MetBrewer palette.

# Colour by continent (a categorical variable) with five colours from the
# MetBrewer "Navajo" palette.
# The base `plot` titles the colour legend "Population in millions"; colour
# now shows continents, so the title is overridden here.
plot +
  geom_point(
    aes(x = gdpPercap, y = lifeExp, size = pop / 1e6, color = continent)
  ) +
  scale_color_manual(values = met.brewer("Navajo", 5)) +
  labs(color = "Continent")

Please note: if you want to apply the palette to the fill aesthetic rather than the color aesthetic, use the scale_fill_manual() function instead of scale_color_manual(). This is useful for boxplots or bar charts.

# Boxplots of GDP per capita by continent, filled with a MetBrewer palette
# through scale_fill_manual() (the fill counterpart of scale_color_manual()).
# The continuous `color = year` mapping was removed: it cannot define boxplot
# groups, so ggplot2 dropped it with a "following aesthetics were dropped"
# warning.
gapminder %>%
  filter(gdpPercap < 60000) %>%
  ggplot(aes(x = continent, y = gdpPercap, fill = continent)) +
  geom_boxplot() +
  scale_fill_manual(values = met.brewer("Navajo", 5)) +
  theme_minimal() +
  labs(x = "Continent", y = "GDP per Capita", fill = "Continent")

scale fill manual

themes

# Nectar pH measured after adding different microbes.
df <- data.frame(
  Names = as.factor(c("Bacteria", "Yeast", "None")),
  Quantity = c(2.5, 5.5, 7.5)
)

library(ggplot2)
library(tidyverse)
# Fix the bar order explicitly: Bacteria, Yeast, None.
df <- df %>%
  mutate(Names = fct_relevel(Names, c("Bacteria", "Yeast", "None")))

# Bar chart with one manual colour per treatment and a stripped-down theme.
# All theme tweaks live in one theme() call; each element is set exactly once.
ggplot(df, aes(Names, Quantity, fill = Names)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("#110a62", "#fcd749", "#b5b4b5")) +
  labs(y = "Nectar pH", x = "Microbe added to nectar") +
  theme_classic() +
  theme(
    legend.position = "none",
    text = element_text(size = 22),
    axis.text = element_text(size = 22, color = "black"),
    axis.line.x = element_blank(),
    # `linewidth` replaces the `size` argument of element_line(), which is
    # deprecated as of ggplot2 3.4.0
    axis.ticks = element_line(linewidth = 1, color = "black"),
    axis.ticks.x = element_blank(),
    axis.ticks.length = unit(0.5, "cm")
  )

# ggThemeAssist::ggThemeAssistGadget(name of the plot)

graphics

x11()  # open a new window (graphics device) for plots
graphics.off()  # close all open graphics devices, including the new window

Normal distribution

Normal distribution, also known as the Gaussian distribution, is a probability distribution that is symmetric about the mean, showing that data near the mean are more frequent in occurrence than data far from the mean.

library(tidyverse)
# Parameters of the simulated height distribution.
# The mean and standard deviation are called `mu` and `sigma` so that they do
# not shadow base R's mean() and sd().
n <- 1000
mu <- 170 # cm
sigma <- 6.35 # cm
binwidth <- 0.3
set.seed(1234)
df <- data.frame(x = rnorm(n, mu, sigma))

# Histogram of the sample with the theoretical normal curve on top.
# Only `x` is an aesthetic; the scalar parameters are ordinary variables.
ggplot(df, aes(x = x)) +
  theme_bw() +
  # `linewidth` (not `size`) sets the bar outline width since ggplot2 3.4.0
  geom_histogram(
    binwidth = binwidth,
    colour = "white",
    fill = "lightblue",
    linewidth = 0.1
  ) +
  # density * n * binwidth rescales the density curve to the count axis
  stat_function(
    fun = function(x) dnorm(x, mean = mu, sd = sigma) * n * binwidth,
    color = "darkred",
    linewidth = 1
  )

wordcloud

library(googlesheets4)
library(dplyr)
library(wordcloud)
## Loading required package: RColorBrewer
## 
## Attaching package: 'wordcloud'
## The following object is masked from 'package:PerformanceAnalytics':
## 
##     textplot
library(RColorBrewer)

# gs4_auth()
# Address of the shared Google Sheet; its contents are read into `test`.
path <- "https://docs.google.com/spreadsheets/d/1ac8CuAQdRNXp9MjKsG7YWiHcT64tRgnCqlY9UhX-jEo/edit?usp=sharing"
test <- read_sheet(path)
## ! Using an auto-discovered, cached token.
##   To suppress this message, modify your code or options to clearly consent to
##   the use of a cached token.
##   See gargle's "Non-interactive auth" vignette for more details:
##   <https://gargle.r-lib.org/articles/non-interactive-auth.html>
## ℹ The googlesheets4 package is using a cached token for 'cssaneesh@gmail.com'.
## ✔ Reading from "wordcloud".
## ✔ Range 'Sheet1'.
head(test, 3)
# Count how often each topic occurs (`n`) and scale the counts by 10.
test1 <- test %>%
  select(topic) %>%
  count(topic) %>%
  mutate(count = n * 10) %>%
  as.data.frame()
head(test1, 3)
max(test1$count)
## [1] 90
# fixed seed for the word cloud
set.seed(123)

wordcloud(
  words = test1$topic,
  freq = test1$count,
  min.freq = 10,
  max.words = 50,
  colors = brewer.pal(7, "BrBG")
)

# export the file as .pdf

Legend

df <- data.frame(
  name = c("saneesh", "kishan", "anil", "mahi", "sanusha"),
  sex = c("male", "female", "male", "male", "female"),
  weight = c(60, 58, 65, 70, 48),
  favno = c(2, 6, 10, 1, 15)
)

# One base plot that maps colour, size AND shape, so every legend removed
# below actually exists. guides(shape = "none") shows nothing unless `shape`
# is mapped.
legend_demo <- ggplot(
  df,
  aes(x = sex, y = weight, col = name, size = favno, shape = sex)
) +
  geom_point()
legend_demo

# remove all legends
legend_demo + theme(legend.position = "none")

# remove legend created by color
legend_demo + guides(color = "none")

# remove legend created by shape
legend_demo + guides(shape = "none")

# remove legend created by size
legend_demo + guides(size = "none")

ggflowchart

# install.packages('ggflowchart')
library(ggflowchart)

# Edge list of the flowchart: one row per arrow, from -> to.
data <- tibble::tribble(
  ~from, ~to,
  "A",   "B",
  "A",   "C",
  "A",   "D",
  "B",   "E",
  "C",   "F",
  "F",   "G"
)

ggflowchart(data)

talk blog

Functions

dice

# The six faces of a die.
dice <- 1:6

# Roll the die once.
#
# x: unused; kept only so existing calls keep working.
# Returns one value drawn at random from the global `dice`.
myluck <- function(x) {
  # `TRUE` is spelled out because `T` is an ordinary, reassignable variable.
  sample(dice, size = 1, replace = TRUE)
}

myluck()
## [1] 6

pick a name

# Candidates to pick from.
names <- c("saneesh", "appu", "sanusha")

# Pick one name at random.
#
# x: unused; kept only so existing calls keep working.
# Returns one element drawn at random from the global `names` vector.
who <- function(x) {
  # Arguments are named and `TRUE` is spelled out (`T` can be reassigned).
  sample(names, size = 1, replace = TRUE)
}

who()
## [1] "saneesh"

DAG

library(dagitty)
## 
## Attaching package: 'dagitty'
## The following object is masked from 'package:hablar':
## 
##     convert
# DAG with three arrows, all pointing into RCD:
# Treatment -> RCD, Livestock -> RCD and Trench -> RCD.
sapling <- dagitty("dag{
    Treatment-> RCD <- Livestock;
    Trench -> RCD
}")

# Node layout: x gives the column, y the row (0 is the middle row).
coordinates(sapling) <- list(
  x = c(
    Treatment = 1, # column 1
    Livestock = 2, # column 2
    Trench = 2,    # column 2
    RCD = 2        # column 2
  ),
  y = c(
    Treatment = 0,  # middle row
    RCD = 0,        # middle row
    Livestock = -1, # above the middle row
    Trench = 1      # below the middle row
  )
)

plot(sapling)

function to split

df <- data.frame(name = as.factor(c("James Bond", "Spider Man", "Iron Man")))
# df <- df %>% separate(name, c('Genus', 'Species'), sep = '([ ])')

# Split the `name` column of `df` at the space into `Genus` and `Species`,
# print the result and return it invisibly (the value of print()).
shorten <- function(df) {
  df %>%
    separate(name, into = c("Genus", "Species"), sep = "([ ])") %>%
    print()
}

shorten(df)
##    Genus Species
## 1  James    Bond
## 2 Spider     Man
## 3   Iron     Man

Model

Model with interaction

Model interaction

library(ggplot2)
library(dplyr)

# Simulated weight and height for 20 males followed by 20 females.
data <- data.frame(
  sex = rep(c("male", "female"), each = 20),
  weight = c(rnorm(20, mean = 65, sd = 5), rnorm(20, mean = 55, sd = 5)),
  height = c(rnorm(20, mean = 165, sd = 6), rnorm(20, mean = 152, sd = 6))
)


# Plot the interaction using ggplot2
# colour is mapped to sex, so points and fitted lines are drawn per sex
data %>%
  ggplot(aes(x = height, y = weight, color = sex)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

web scraping

library(rvest)
## 
## Attaching package: 'rvest'
## The following object is masked from 'package:readr':
## 
##     guess_encoding
# page <- read_html('https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population')
# tables <- html_table(page)
# typeof(tables)
# unlist(tables)
# table2 <- as.data.frame(tables[[2]])
# head(table2, 2)

Rmarkdown

knitr global options

to apply to every chunk in the file

inside the chunk write knitr::opts_chunk$set(include= ,echo = , message= , warning= )

# knitr::opts_chunk$set(message = TRUE, echo = TRUE, warning = TRUE)

include: whether the code and its results appear in the output at all
echo: whether the code is shown in the output (the results are still shown)
message: whether messages generated by the code are shown
warning: whether warnings generated by the code are shown

these options can be written for individual chunks as well

## [1] 5

headings

1 # heading 1
2 ## heading 2
3 ### heading 3

italics
italic

bold
bold

plot() to show r code/function
@Saneesh

blockquotes are written after >

this is a blockquote
— Saneesh

plain code

hello

unordered items

  • item 1
  • item 2
    • sub item 1a
    • sub item 2b

ordered items

  1. Item 1
  2. Item 2
    • Item 2a # give two spaces before the +
    • Item 2b

writing mathematical functions

adding image and caption

write

write

Open a chunk with three backticks followed by {r, echo=FALSE, out.width="70%", fig.align="center", fig.cap='write'} (remember to close the curly bracket). Inside the chunk write knitr::include_graphics("Idly.jpg") (keep the image in the project folder), then close the chunk with three backticks (```).

Idly
Idly

Alternatively, write an exclamation mark (!), then the caption in square brackets ([caption]), then the file name with its extension in round brackets, e.g. (Idly.jpg).

Resources

bbcplot
colorhunt
colors
colorpaletts
colorpaletts
coloradobe
colormind
datavizpyr
datatoviz
Cédric Scherer
ggplottheme
mycolor
viz-palette
Intro to r